# coding=utf8

import requests
import pymongo
from lxml import etree
import re
import json
import aiohttp
import fake_useragent
import datetime
from requests import Session
import time

# Local JSON file handed to fake_useragent through its ``path`` argument
# (presumably a cached browser database for fake_useragent 0.1.10 — confirm
# that the file ships alongside this script).
location = 'fake_useragent_0.1.10.json'

# Shared user-agent source; ``ua.random`` supplies the User-Agent header for
# each listing-page request made in get_hrefs().
ua = fake_useragent.UserAgent(path=location)

from urllib.parse import quote_plus
from pymongo import MongoClient

# MongoDB connection string for the instance on 127.0.0.1. User name and
# password are passed through quote_plus so reserved characters are escaped.
uri = "mongodb://%s:%s@%s" % (quote_plus('root'), quote_plus('root'), '127.0.0.1')

# NOTE(review): the commented-out line below embeds credentials for another
# host directly in source; consider moving connection settings to
# configuration or environment variables and removing it from the file.
# uri = "mongodb://%s:%s@%s" % (quote_plus('zane'), quote_plus('*#06#'), '121.9.245.183')
client = MongoClient(uri)
# Database and collection handles. Only xzdz_house_list is written to by the
# code visible in this file (one document per listing page, see get_hrefs());
# xzdz and xzdz_pagesource are not referenced here.
db = client['Homestay']
xzdz = db['xiaozhu']
xzdz_house_list = db['xiaozhuhouselist']
xzdz_pagesource = db['xiaozhupagesource']

# District names (in Chinese) that get_hrefs() crawls, in this order.
list_distric = ["南沙", "番禺", "天河", "海珠", "越秀", "荔湾", "从化", "白云", "花都", "黄埔", "增城"]
# Maps each district's Chinese name to the slug used in its listing-page URL.
dict_district = {'从化': 'conghua', '南沙': 'nansha', '增城': 'zengcheng', '天河': 'tianhe', '海珠': 'haizhu', '番禺': 'fanyu', '白云': 'baiyun', '花都': 'huadu', '荔湾': 'liwan', '越秀': 'yuexiu', '黄埔': 'huangpu'}

# XPath selecting the child <span> elements of any <span> whose text contains
# '今天' ("today"). Not used by the code visible in this file.
booking_xpath_today = "//span[contains(text(),'今天')]/child::span"


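# For each district, start from its entry listing page, walk through all of its listing pages, collect the href of every homestay, and store the hrefs in the database.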

def get_hrefs():
    """Crawl every district's listing pages and store the homestay links.

    For each district in ``list_distric`` the listing pages are requested in
    order. The first page has no page marker in its URL; page N (N > 1) uses
    a ``pN-`` segment. On each page the ``href`` of every
    ``<a class="resule_img_a" target="_blank">`` element is collected, and
    one document per page is inserted into ``xzdz_house_list``::

        {"district": <name>, "hrefs": [<href>, ...], "insert_time": <datetime>}

    ``insert_time`` is the local time truncated to whole seconds. Paging for
    a district stops after the first page that contains no element whose
    text is exactly ``>`` (the "next page" link).

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched,
            including when the request exceeds the timeout.
        KeyError: if a district has no slug in ``dict_district`` or a matched
            anchor has no ``href`` attribute.
    """
    for distric in list_distric:
        print("now is {}".format(distric))
        # The slug does not change between pages, so look it up once.
        district_slug = dict_district[distric]
        count = 1
        while True:
            page_count = "p{}-".format(count) if count > 1 else ""
            entry_page = "https://gz.xiaozhu.com/{}-duanzufang-{}8/".format(district_slug, page_count)
            # Without a timeout a single stalled connection would block the
            # whole crawl indefinitely.
            distric_page_res = requests.get(
                entry_page,
                headers={'user-agent': ua.random},
                timeout=30,
            )
            entry_page_html = etree.HTML(distric_page_res.text)
            list_houses = [
                house_html.attrib['href']
                for house_html in entry_page_html.xpath("//a[@class='resule_img_a' and @target='_blank']")
            ]
            dict_list_houses = {
                "district": distric,
                "hrefs": list_houses,
                # Truncate to whole seconds directly. Slicing str(now()) breaks
                # when the microsecond field is 0, because the string then has
                # no fractional part to cut off.
                "insert_time": datetime.datetime.now().replace(microsecond=0),
            }
            xzdz_house_list.insert_one(dict_list_houses)
            print("insert one list into mongo")
            count += 1
            # No ">" link on the page means this was the district's last page.
            if not entry_page_html.xpath("//*[text()='>']"):
                print("一個區的鏈接爬完了：{}".format(distric))
                break


# Run the full crawl only when this file is executed as a script.
if __name__ == "__main__":
    get_hrefs()
